full analysis

In [83]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import pickle
In [84]:
df = pickle.load (open("pj_df_full.20190629_095112.pkl", "rb"))      # cases corpus
dfmo = pickle.load (open("posp/pj_demo_dfmo_full.20190629_163240.pkl", "rb"))   # political speeches dataset
dfre = pickle.load(open("re/pj_dfre_full.20190629_115819.pkl", "rb"))   # reuters dataset
dftw = pickle.load(open("tw/pj_dftw_full.20190630_005652.pkl", "rb"))   # twitter dataset
In [85]:
plt.hist(df["antpast"], bins = 50, alpha = 0.5, label = "cases ", range = (0.01,0.9), color = 'g', density = 1)
plt.hist(dfmo["antpast"], bins = 50, alpha = 0.5, label = "pol speeches ", range = (0.01,0.9), color = 'b', density = 1)
plt.hist(dfre["antpast"], bins = 50, alpha = 0.5, label = "reuters ", range = (0.01,0.9), color = 'r', density = 1)
plt.hist(dftw["antpast"], bins = 50, alpha = 0.5, label = "twitter ", range = (0.01,0.9), color = 'violet', density = 1)
plt.title("POS tagging, fraction of verbs in past tense ")
plt.legend(loc='upper right')
plt.show()
In [87]:
plt.hist(df["antpresent"], bins = 50, alpha = 0.5, label = "cases ", range = (0.01,0.9), color = 'g', density = 1)
plt.hist(dfmo["antpresent"], bins = 50, alpha = 0.5, label = "pol speeches ", range = (0.01,0.9), color = 'b', density = 1)
plt.hist(dfre["antpresent"], bins = 50, alpha = 0.5, label = "reuters ", range = (0.01,0.9), color = 'r', density = 1)
plt.hist(dftw["antpresent"], bins = 50, alpha = 0.5, label = "twitter ", range = (0.01,0.9), color = 'violet', density = 1)
plt.title("POS tagging, fraction of verbs in present tense ")
plt.legend(loc='upper right')
plt.show()
In [92]:
plt.hist(df["antfuture"], bins = 50, alpha = 0.5, label = "cases ", range = (0.01,0.9), color = 'g', density = 1)
plt.hist(dfmo["antfuture"], bins = 50, alpha = 0.5, label = "pol speeches ", range = (0.01,0.9), color = 'b', density = 1)
plt.hist(dfre["antfuture"], bins = 50, alpha = 0.5, label = "reuters ", range = (0.01,0.9), color = 'r', density = 1)
plt.hist(dftw["antfuture"], bins = 50, alpha = 0.5, label = "twitter ", range = (0.01,0.9), color = 'violet', density = 1)
plt.title("POS tagging, fraction of verbs in future tense ")
plt.legend(loc='upper right')
plt.show()
In [93]:
plt.hist(df["antfpast"], bins = 50, alpha = 0.5, label = "cases ", range = (0.01,0.9), color = 'g', density = 1)
plt.hist(dfmo["antfpast"], bins = 50, alpha = 0.5, label = "pol speeches ", range = (0.01,0.9), color = 'b', density = 1)
plt.hist(dfre["antfpast"], bins = 50, alpha = 0.5, label = "reuters ", range = (0.01,0.9), color = 'r', density = 1)
plt.hist(dftw["antfpast"], bins = 50, alpha = 0.5, label = "twitter ", range = (0.01,0.9), color = 'violet', density = 1)
plt.title("LIWC, fraction of focus past")
plt.legend(loc='upper right')
plt.show()
In [94]:
plt.hist(df["antfpresent"], bins = 50, alpha = 0.5, label = "cases ", range = (0.01,0.9), color = 'g', density = 1)
plt.hist(dfmo["antfpresent"], bins = 50, alpha = 0.5, label = "pol speeches ", range = (0.01,0.9), color = 'b', density = 1)
plt.hist(dfre["antfpresent"], bins = 50, alpha = 0.5, label = "reuters ", range = (0.01,0.9), color = 'r', density = 1)
plt.hist(dftw["antfpresent"], bins = 50, alpha = 0.5, label = "twitter ", range = (0.01,0.9), color = 'violet', density = 1)
plt.title("LIWC, fraction of focus present")
plt.legend(loc='upper right')
plt.show()
In [95]:
plt.hist(df["antffuture"], bins = 50, alpha = 0.5, label = "cases ", range = (0.01,0.9), color = 'g', density = 1)
plt.hist(dfmo["antffuture"], bins = 50, alpha = 0.5, label = "pol speeches ", range = (0.01,0.9), color = 'b', density = 1)
plt.hist(dfre["antffuture"], bins = 50, alpha = 0.5, label = "reuters ", range = (0.01,0.9), color = 'r', density = 1)
plt.hist(dftw["antffuture"], bins = 50, alpha = 0.5, label = "twitter ", range = (0.01,0.9), color = 'violet', density = 1)
plt.title("LIWC, fraction of focus future")
plt.legend(loc='upper right')
plt.show()
In [96]:
plt.hist(df["antpast"], 50, density=1, facecolor = 'blue', alpha = 0.75, label='past tense')
plt.hist(df["antpresent"], 50, density=1, facecolor = 'orange', alpha = 0.75, label = 'present tense')
plt.hist(df["antfuture"], 50, density=1, facecolor = 'green', alpha = 0.75, label = 'future tense')
plt.title("POS tagging, all cases in the cases corpus")
plt.legend(loc='best')
plt.show()
In [97]:
plt.hist(df["antfpast"], 50, density=1, facecolor = 'blue', alpha = 0.75, label='past focus')
plt.hist(df["antfpresent"], 50, density=1, facecolor = 'orange', alpha = 0.75, label='present focus')
plt.hist(df["antffuture"], 50, density=1, facecolor = 'green', alpha = 0.75, label='future focus')
plt.title("LIWC, all cases in the cases corpus")
plt.legend(loc='best')
plt.show()

POS tagging: the fractions of past, present, future tenses, all parties

In [98]:
spas = df["npast"].sum()			# spacy 
sprs = df["npresent"].sum()
sfus = df["nfuture"].sum()

sAvepast, sAvepresent, sAvezfuture = [spas, sprs, sfus]/(spas + sprs + sfus)

LIWC, the fractions of past focus, present focus and future focus, all parties

In [99]:
lpas = df["nfpast"].sum()		# LIWC         
lprs = df["nfpresent"].sum()
lfus = df["nffuture"].sum()

lAvepast, lAvepresent, lAvezfuture = [lpas, lprs, lfus]/(lpas + lprs + lfus)
excerpt from scipy.stats.ttest_ind :

We can use this test, if we observe two independent samples from the same or different population. The test measures whether the average (expected) value differs significantly across samples. If we observe a large p-value, for example larger than 0.05 or 0.1, then we cannot reject the null hypothesis of identical average. If the p-value is smaller than the threshold, e.g. 1% , 5% or 10%, then we reject the null hypothesis of equal averages. The two samples does not need to have the same length.

excerpt from scipy.chisquare:

Tests the null hypothesis that the categorical data has the given frequencies. (f_obs, f_exp, ddof = 0, axis = 0) The p-value is computed using a chi-squared distribution with k - 1- ddof (degrees of freedom), where k = the number of observed frequencies.

In [100]:
from scipy.stats import ttest_ind    # ttest_ind tests the equality of means
from scipy.stats import chisquare    # chisquare tests the independence of 2 distributions
In [101]:
# calculate the proportions of past, present and future tenses averaged across the entire corpus
# see above                 POS tagging
print(sAvepast, sAvepresent, sAvezfuture)
0.48633199147164446 0.4944106035785858 0.019257404949769755
In [102]:
# chisquared test   observed data vs expected (=uniformly distributed) data   POS tagging  entire cases corpus
#chisquare([100*sAvepast, 100*sAvepresent, 100*sAvefuture], [33, 33, 34])
chisquare([49, 49, 2], [33, 33, 34])
# high test statistic and low p-value indicate that the observed distribution is unequal to the expected, uniform distribution
# hence the use of past, present and future tenses in the case corpus is deliberate
Out[102]:
Power_divergenceResult(statistic=45.632798573975045, pvalue=1.2330017493455464e-10)
In [103]:
# calculate the proportions of past, present and future focus averaged across the entire corpus
# see above                   LIWC
print(lAvepast, lAvepresent, lAvezfuture)
0.40126752767022084 0.5258992993042367 0.07283317302554237
In [104]:
# chisquared test   observed data vs expected (=uniformly distributed) data   LIWC  entire cases corpus
chisquare([100*lAvepast, 100*lAvepresent, 100*lAvezfuture], [33, 33, 34])
# high test statistic and low p-value indicate that the observed distribution is unequal to the expected, uniform distribution
# hence the use of past, present and future focus in the case corpus is deliberate
Out[104]:
Power_divergenceResult(statistic=34.1619251897572, pvalue=3.817967256095073e-08)

political speeches dataset

In [105]:
spas = dfmo[ 'npast'].sum()
sprs = dfmo[ 'npresent'].sum()
sfus = dfmo[ 'nfuture'].sum()

sAvepast, sAvepresent, sAvefuture = [spas, sprs, sfus]/(spas + sprs + sfus)

lpas = dfmo[ 'nfpast'].sum()
lprs = dfmo[ 'nfpresent'].sum()
lfus = dfmo[ 'nffuture'].sum()

lAvepast, lAvepresent, lAvefuture = [lpas, lprs, lfus]/(lpas + lprs + lfus)
In [106]:
print("political speeches, both speakers POS ", sAvepast, sAvepresent, sAvefuture)
political speeches, both speakers POS  0.2169448828940865 0.6771416272061989 0.10591348989971461
In [107]:
chisquare([22, 68, 10], [33, 33, 34])
# test result indicates POS past/present/future tenses distribution not random
Out[107]:
Power_divergenceResult(statistic=57.72905525846703, pvalue=2.9126944473126836e-13)
In [108]:
print("political speeches, both speakers LIWC ", lAvepast, lAvepresent, lAvefuture)
political speeches, both speakers LIWC  0.20449907805777504 0.6728826060233559 0.12261831591886908
In [109]:
chisquare([20, 67, 13], [33, 33, 34])
# test result indicates LIWC past/present/future focus distribution not random
Out[109]:
Power_divergenceResult(statistic=53.12210338680927, pvalue=2.9152903563929526e-12)

reuters data set

In [110]:
spa = dfre["npast"].sum()
spr = dfre["npresent"].sum()
sfu = dfre["nfuture"].sum()
sAvepast = spa /(spa + spr + sfu)
sAvepresent = spr / (spa + spr + sfu)
sAvezfuture = sfu / (spa + spr + sfu)

lpa = dfre["nfpast"].sum()
lpr = dfre["nfpresent"].sum()
lfu = dfre["nffuture"].sum()
lAvepast = lpa/ (lpa + lpr + lfu)
lAvepresent = lpr/ (lpa + lpr + lfu)
lAvezfuture = lfu/ (lpa + lpr + lfu)
In [111]:
modfplot = pd.DataFrame({'Avepast' : [lAvepast, sAvepast], 'Avepresent': [lAvepresent, sAvepresent], 'Avezfuture': [lAvezfuture, sAvezfuture]}, index =
 ['LIWC', 'Spacy'] )
modfplot.plot.bar(rot=0)
plt.show()
In [112]:
print("POS:", sAvepast, sAvepresent, sAvezfuture, "   LIWC:", lAvepast, lAvepresent, lAvezfuture)
POS: 0.5968970180058358 0.3497082058216497 0.05339477617251441    LIWC: 0.4841547528017857 0.40684786554518 0.10899738165303427
In [113]:
# LIWC, reuters, past focus, present focus, future focus
chisquare([48, 41, 11], [33, 33, 34])
# large t statistics and small p value indicate the observed distribution is not uniform
Out[113]:
Power_divergenceResult(statistic=24.316399286987522, pvalue=5.245187106240977e-06)
In [114]:
# POS tagging, reuters, past tense, present tense, future tense
chisquare([60, 35, 5], [33, 33, 34])
# large t statistics and small p value indicate the observed distribution is not uniform
Out[114]:
Power_divergenceResult(statistic=46.94741532976827, pvalue=6.389962233044912e-11)

spacy vs LIWC

cases corpus

In [115]:
spas = df["npast"].sum()			# spacy 
sprs = df["npresent"].sum()
sfus = df["nfuture"].sum()

sAvepast, sAvepresent, sAvezfuture = [spas, sprs, sfus]/(spas + sprs + sfus)
lpas = df["nfpast"].sum()		# LIWC         
lprs = df["nfpresent"].sum()
lfus = df["nffuture"].sum()

lAvepast, lAvepresent, lAvezfuture = [lpas, lprs, lfus]/(lpas + lprs + lfus)
In [116]:
modfplot = pd.DataFrame({'Avepast' : [lAvepast, sAvepast], 'Avepresent': [lAvepresent, sAvepresent], 'Avezfuture': [lAvezfuture, sAvezfuture]}, index =
 ['LIWC', 'Spacy'] )
modfplot.plot.bar(rot=0)
plt.legend(loc='upper center')
plt.show()
In [35]:
# POS tagging    past, present, future over the years
In [117]:
import matplotlib.pyplot as plt
axpast = df[df["year"] != 0 ].groupby('year')['antpast'].mean().plot(color='red', label='past tense')
axpresent = df[df["year"] != 0].groupby('year')['antpresent'].mean().plot(color='green', label='present tense')
axfuture = df[df["year"] !=0].groupby('year')['antfuture'].mean().plot(color='blue', label='future tense')
plt.legend(loc='best')
plt.show()	
In [122]:
pasttense_before1970 = df [df["year"] < 1970]["antpast"]
pasttense_after1970 = df [df["year"] >= 1970]["antpast"]
In [123]:
# indicates means before 1970 statististically significantly differs from after 1970
ttest_ind(pasttense_before1970, pasttense_after1970)
Out[123]:
Ttest_indResult(statistic=9.648791508483363, pvalue=7.27435956407142e-22)
In [120]:
#  LIWC    past, present, future focus over the years
In [121]:
import matplotlib.pyplot as plt
axpast = df[df["year"] != 0 ].groupby('year')['antfpast'].mean().plot(color='red', label='past focus')
axpresent = df[df["year"] != 0].groupby('year')['antfpresent'].mean().plot(color='green', label='present focus')
axfuture = df[df["year"] !=0].groupby('year')['antffuture'].mean().plot(color='blue', label='future focus')
plt.legend(loc='best')
plt.show()	
In [124]:
pastfocus_before1970 = df [df["year"] < 1970]["antfpast"]
pastfocus_after1970 = df [df["year"] >= 1970]["antfpast"]
In [125]:
# indicates means before 1970 statististically significantly differs from after 1970
ttest_ind(pastfocus_before1970, pastfocus_after1970)
Out[125]:
Ttest_indResult(statistic=10.291460917234133, pvalue=1.256953354749462e-24)
In [39]:
# LIWC  antfpast, antfpresent, antffuture   vs  POS tagging antpast, antpresent, antfuture
In [126]:
ttest_ind(df["antfpast"], df["antpast"])
# high t-statistic and low p-value indicate the mean of past tenses (POS) and past focus (LIWC) are not equal
Out[126]:
Ttest_indResult(statistic=-28.8494721330526, pvalue=9.031728047365847e-177)
In [127]:
ttest_ind(df["antfpresent"], df["antpresent"])  
# high t-statistic and low p-value indicate the mean of present tenses (POS) and present focus (LIWC) are not equal
Out[127]:
Ttest_indResult(statistic=16.251280671509075, pvalue=9.823219810691702e-59)
In [128]:
ttest_ind(df["antffuture"], df["antfuture"])
# high t-statistic and low p-value indicate the mean of future tenses (POS) and future focus (LIWC) are not equal
Out[128]:
Ttest_indResult(statistic=57.424724422704024, pvalue=0.0)

political speeches dataset

In [129]:
# POS tagging vs LIWC   past     both speakers
ttest_ind(dfmo['antpast'], dfmo['antfpast'], equal_var=False, nan_policy='omit')
# result indicates equal means found by POS tagging and LIWC
Out[129]:
Ttest_indResult(statistic=1.2045528280701834, pvalue=0.22897120869447626)
In [130]:
# POS tagging vs LIWC   present     both speakers
ttest_ind(dfmo['antpresent'], dfmo['antfpresent'], equal_var=False, nan_policy='omit')
# result indicates equal means found by POS tagging and LIWC
Out[130]:
Ttest_indResult(statistic=0.9134598376970914, pvalue=0.36146064662445687)
In [131]:
# POS tagging vs LIWC   future    both speakers
ttest_ind(dfmo['antfuture'], dfmo['antffuture'], equal_var=False, nan_policy='omit')
# result indicates unequal means found by POS tagging and LIWC
Out[131]:
Ttest_indResult(statistic=-3.5484293598562964, pvalue=0.0004249840569410666)

reuters data set

In [132]:
spa = dfre["npast"].sum()
spr = dfre["npresent"].sum()
sfu = dfre["nfuture"].sum()
sAvepast = spa /(spa + spr + sfu)
sAvepresent = spr / (spa + spr + sfu)
sAvezfuture = sfu / (spa + spr + sfu)

lpa = dfre["nfpast"].sum()
lpr = dfre["nfpresent"].sum()
lfu = dfre["nffuture"].sum()
lAvepast = lpa/ (lpa + lpr + lfu)
lAvepresent = lpr/ (lpa + lpr + lfu)
lAvezfuture = lfu/ (lpa + lpr + lfu)
In [133]:
modfplot = pd.DataFrame({'Avepast' : [lAvepast, sAvepast], 'Avepresent': [lAvepresent, sAvepresent], 'Avezfuture': [lAvezfuture, sAvezfuture]}, index =
 ['LIWC', 'Spacy'] )
modfplot.plot.bar(rot=0)
plt.show()

twitter data set

In [134]:
spa = dftw["npast"].sum()
spr = dftw["npresent"].sum()
sfu = dftw["nfuture"].sum()
sAvepast = spa /(spa + spr + sfu)
sAvepresent = spr / (spa + spr + sfu)
sAvezfuture = sfu / (spa + spr + sfu)

lpa = dftw["nfpast"].sum()
lpr = dftw["nfpresent"].sum()
lfu = dftw["nffuture"].sum()
lAvepast = lpa/ (lpa + lpr + lfu)
lAvepresent = lpr/ (lpa + lpr + lfu)
lAvezfuture = lfu/ (lpa + lpr + lfu)
In [135]:
modfplot = pd.DataFrame({'Avepast' : [lAvepast, sAvepast], 'Avepresent': [lAvepresent, sAvepresent], 'Avezfuture': [lAvezfuture, sAvezfuture]}, index =
 ['LIWC', 'Spacy'] )
modfplot.plot.bar(rot=0)
plt.show()

means equivalence test past and present, present and future

cases corpus

In [51]:
# POS tagging  antpast, antpresent, antfuture
In [136]:
ttest_ind(df["antpast"], df["antpresent"])
# high t-statistic and low p-value indicate the mean of past tenses and present tenses are not equal
Out[136]:
Ttest_indResult(statistic=3.9810333139896397, pvalue=6.903741440270563e-05)
In [137]:
ttest_ind(df["antpresent"], df["antfuture"])
# high t-statistic and low p-value indicate the mean of present tenses and future tenses are not equal
Out[137]:
Ttest_indResult(statistic=210.58070597007398, pvalue=0.0)

political speeches

In [138]:
# past tense   vs   present tense    both speakers
ttest_ind(dfmo['antpast'], dfmo['antpresent'], equal_var=False, nan_policy='omit')
# results show strongly significant difference between past tense usage and present tense usage
Out[138]:
Ttest_indResult(statistic=-57.09492905140832, pvalue=1.2021268647135612e-218)
In [139]:
# present tense   vs   future tense    both speakers
ttest_ind(dfmo['antpresent'], dfmo['antfuture'], equal_var=False, nan_policy='omit')
# results show strongly significant difference between present tense usage and future tense usage
Out[139]:
Ttest_indResult(statistic=90.73799544999683, pvalue=1.6028586258298012e-279)

reuters data set

In [140]:
ttest_ind(dfre['antpast'], dfre['antpresent'], equal_var = False, nan_policy='omit')
# deliberate use of past, present tenses
Out[140]:
Ttest_indResult(statistic=38.56836441489094, pvalue=0.0)
In [141]:
ttest_ind(dfre['antpresent'], dfre['antfuture'], equal_var = False, nan_policy='omit')
# deliberate use of present, future tenses
Out[141]:
Ttest_indResult(statistic=85.92888469014095, pvalue=0.0)

twitter data set

In [142]:
ttest_ind(dftw['antpast'], dftw['antpresent'], equal_var = False, nan_policy='omit')
# deliberate use of past, present tenses
Out[142]:
Ttest_indResult(statistic=-100.5204464921747, pvalue=0.0)
In [143]:
ttest_ind(dftw['antpresent'], dftw['antfuture'], equal_var = False, nan_policy='omit')
# deliberate use of present, future tenses
Out[143]:
Ttest_indResult(statistic=150.55508675466808, pvalue=0.0)

cases corpus, republican vs democratic

In [144]:
df_republican = df[ df["Party"] == "Republican"]
df_democratic = df[ df["Party"] == "Democratic"]
In [145]:
# POS tagging    past tense, present tense, future tense    rep 
rpas = df_republican['npast'].sum()
rprs = df_republican['npresent'].sum()
rfus = df_republican ['nfuture'].sum()

rAvepast, rAvepresent, rAvefuture = [rpas, rprs, rfus]/(rpas + rprs + rfus)
In [146]:
# POS tagging    past tense, present tense, future tense    dem
dpas = df_democratic['npast'].sum()
dprs = df_democratic['npresent'].sum()
dfus = df_democratic['nfuture'].sum()

dAvepast, dAvepresent, dAvefuture = [dpas, dprs, dfus]/(dpas + dprs + dfus)
In [147]:
# POS tagging    past tense, present tense, future tense    rep vs dem
modfplot = pd.DataFrame({'Ave1past' : [rAvepast, dAvepast], 'Ave2present': [rAvepresent, dAvepresent], 'Avef3future': [rAvefuture, dAvefuture]}, index = ['rep', 'dem'] )
modfplot.plot.bar(rot=0)
plt.show()
In [148]:
# POS tagging  past tense    rep vs dem
ttest_ind(df_republican["antpast"], df_democratic["antpast"])
# test result indicating use of past tense not significantly different between republican and democratic judges
Out[148]:
Ttest_indResult(statistic=1.4863221647624705, pvalue=0.13726194142666256)
In [149]:
# POS tagging  present tense  rep vs dem
ttest_ind(df_republican["antpresent"], df_democratic["antpresent"])
# test result indicating use of present tense not significantly different between republican and democratic judges
Out[149]:
Ttest_indResult(statistic=-1.5542463287572683, pvalue=0.12019394412787038)
In [150]:
# POS tagging  future tense  rep vs dem
ttest_ind(df_republican["antfuture"], df_democratic["antfuture"])
# test result indicating use of future tense not significantly different between republican and democratic judges
Out[150]:
Ttest_indResult(statistic=0.8914861232937964, pvalue=0.3727147081497669)
In [151]:
# LIWC   past focus, present focus, future focus    rep vs dem
rfpas = df_republican['nfpast'].sum()
rfprs = df_republican['nfpresent'].sum()
rffus = df_republican ['nffuture'].sum()

rAvefpast, rAvefpresent, rAveffuture = [rfpas, rfprs, rffus]/(rfpas + rfprs + rffus)

dfpas = df_democratic['nfpast'].sum()
dfprs = df_democratic['nfpresent'].sum()
dffus = df_democratic['nffuture'].sum()

dAvefpast, dAvefpresent, dAveffuture = [dfpas, dfprs, dffus]/(dfpas + dfprs + dffus)
In [152]:
modfplot = pd.DataFrame({'Avef1past' : [rAvefpast, dAvefpast], 'Avef2present': [rAvefpresent, dAvefpresent], 'Avef3future': [rAveffuture, dAveffuture]}, index = ['rep', 'dem'] )
modfplot.plot.bar(rot=0)
plt.show()
In [153]:
# LIWC  past focus   rep vs dem
ttest_ind(df_republican["antfpast"], df_democratic["antfpast"])
# test result indicating use of past focus not significantly different between republican and democratic judges
Out[153]:
Ttest_indResult(statistic=1.1871225805247378, pvalue=0.23524010971524925)
In [154]:
# LIWC  present focus  rep vs dem
ttest_ind(df_republican["antfpresent"], df_democratic["antfpresent"])
# test result indicating use of present focus not significantly different between republican and democratic judges
Out[154]:
Ttest_indResult(statistic=-1.8136488402454323, pvalue=0.06979644819482464)
In [155]:
# LIWC  future focus  rep vs dem
ttest_ind(df_republican["antffuture"], df_democratic["antffuture"])
# test result indicating use of future focus not significantly different between republican and democratic judges
Out[155]:
Ttest_indResult(statistic=1.5543266654359942, pvalue=0.12017478973887423)

political speeches dataset McCain vs Obama

In [156]:
opas = dfmo.loc[ dfmo['speaker'] == 'Obama', 'npast'].sum()
oprs = dfmo.loc[ dfmo['speaker'] == 'Obama', 'npresent'].sum()
ofus = dfmo.loc[ dfmo['speaker'] == 'Obama', 'nfuture'].sum()

oAvepast, oAvepresent, oAvefuture = [opas, oprs, ofus]/(opas + oprs + ofus)

mpas = dfmo.loc[ dfmo['speaker'] == 'McCain', 'npast'].sum()
mprs = dfmo.loc[ dfmo['speaker'] == 'McCain', 'npresent'].sum()
mfus = dfmo.loc[ dfmo['speaker'] == 'McCain', 'nfuture'].sum()

mAvepast, mAvepresent, mAvefuture = [mpas, mprs, mfus]/(mpas + mprs + mfus)
In [157]:
print("POS tagging\nMcCain:", mAvepast, mAvepresent, mAvefuture, "\nObama:", oAvepast, oAvepresent, oAvefuture)
POS tagging
McCain: 0.21972200367164962 0.6712824547600315 0.10899554156831891 
Obama: 0.21573208722741433 0.6797003848268279 0.10456752794575774
In [158]:
modtplot = pd.DataFrame({'Ave1past':[mAvepast, oAvepast], 'Ave2present': [mAvepresent, oAvepresent], 'Ave3future':[mAvefuture, oAvefuture]}, index = ['McCain', 'Obama'])
modtplot.plot.bar(rot = 0, sort_columns=False)
plt.show()
In [159]:
dfmo_McCain = dfmo[ dfmo["speaker"] == 'McCain' ]
In [160]:
dfmo_Obama = dfmo[ dfmo["speaker"] == 'Obama' ]
In [161]:
# past tense   Obama vs McCain
ttest_ind(dfmo_Obama['antpast'], dfmo_McCain['antpast'], equal_var=False, nan_policy='omit')
# no significant difference 
Out[161]:
Ttest_indResult(statistic=0.5124305273839909, pvalue=0.6091905680757264)
In [162]:
# present tense   Obama vs McCain
ttest_ind(dfmo_Obama['antpresent'], dfmo_McCain['antpresent'], equal_var=False, nan_policy='omit')
# no significant difference 
Out[162]:
Ttest_indResult(statistic=0.31705187375818283, pvalue=0.7516874694842609)
In [163]:
# future tense   Obama vs McCain
ttest_ind(dfmo_Obama['antfuture'], dfmo_McCain['antfuture'], equal_var=False, nan_policy='omit')
# no significant difference
Out[163]:
Ttest_indResult(statistic=-1.50044661478685, pvalue=0.13552777192035292)
In [168]:
dfmo['modal_ratio'] = dfmo['lmodal'] / dfmo['nverbs']
In [169]:
dfmo.loc[ dfmo["speaker"] == 'McCain', 'modal_ratio' ].describe()
Out[169]:
count    96.000000
mean      0.020195
std       0.014527
min       0.000000
25%       0.009494
50%       0.017477
75%       0.027122
max       0.063232
Name: modal_ratio, dtype: float64
In [170]:
dfmo.loc[ dfmo["speaker"] == 'Obama', 'modal_ratio' ].describe()
Out[170]:
count    155.000000
mean       0.012669
std        0.009023
min        0.000000
25%        0.006682
50%        0.011173
75%        0.016649
max        0.065574
Name: modal_ratio, dtype: float64
In [172]:
dfmo_McCain = dfmo[ dfmo["speaker"] == 'McCain' ]
dfmo_Obama = dfmo[ dfmo["speaker"] == 'Obama' ]
In [173]:
# Obama vs McCain  usage of modal (would , could, might)
ttest_ind(dfmo_Obama['modal_ratio'], dfmo_McCain['modal_ratio'], equal_var = False, nan_policy='omit')
# test statistics and p value indicate than the mean modal_ratio is significantly different in McCain's political speeches
# vs Obama's speeches
Out[173]:
Ttest_indResult(statistic=-4.56032283856477, pvalue=1.0999375088641693e-05)

deontic future

In [174]:
from tp_utils import *
In [175]:
df.loc[df["ldeont"] > 0, :].sort_values("ldeont", ascending = False)
Out[175]:
case_reversed judge_id year log_cites LastName FirstName Gender Pres Party nlets ... antpresent antfuture nfpast nfpresent nffuture antfpast antfpresent antffuture ldeont lmodal
caseid
X3K1QD 1 1231 1938 0.000000 Kaufman Irving 1 Harry S Truman Democratic 19366.0 ... 0.327103 0.228972 131.0 127.0 65.0 0.405573 0.393189 0.201238 21.0 1.0
X3IMFV 1 1119 1933 0.693147 Hull Frank 0 William J. Clinton Democratic 83982.0 ... 0.435115 0.070883 648.0 861.0 153.0 0.389892 0.518051 0.092058 20.0 53.0
X3JGGO 0 1653 1925 1.098612 Miller Wilbur 1 Harry S Truman Democratic 46558.0 ... 0.363874 0.172775 280.0 308.0 101.0 0.406386 0.447025 0.146589 18.0 7.0
X3BD4Q 0 1795 1926 0.000000 O'Connor James 1 Franklin D. Roosevelt Democratic 20299.0 ... 0.572973 0.248649 59.0 208.0 67.0 0.176647 0.622754 0.200599 18.0 5.0
X40AA9 1 1951 1940 1.386294 Quist Gordon 1 George H.W. Bush Republican 39473.0 ... 0.362587 0.066975 276.0 270.0 80.0 0.440895 0.431310 0.127796 17.0 12.0
X3GPDN 0 0 0 0.000000 -1 14887.0 ... 0.556962 0.145570 56.0 114.0 38.0 0.269231 0.548077 0.182692 13.0 10.0
X46NQ9 0 2241 1938 1.386294 Soper Morris 1 Warren G. Harding Republican 26036.0 ... 0.410828 0.066879 165.0 188.0 36.0 0.424165 0.483290 0.092545 12.0 8.0
X3B742 1 2266 1939 0.693147 Stahl David 1 Lyndon B. Johnson Democratic 34981.0 ... 0.572816 0.090615 103.0 262.0 51.0 0.247596 0.629808 0.122596 12.0 4.0
X481KI 0 2021 1944 1.098612 Robb Charles 1 Theodore Roosevelt Republican 5154.0 ... 0.423729 0.254237 23.0 47.0 18.0 0.261364 0.534091 0.204545 11.0 3.0
X4A0C7 1 1826 1934 1.098612 Paine Elijah 1 John Adams Federalist 61747.0 ... 0.488654 0.048411 322.0 542.0 67.0 0.345865 0.582170 0.071966 11.0 31.0
X3FT5C 1 1299 1929 0.000000 Knowles Hiram 1 Benjamin Harrison Republican 31729.0 ... 0.379747 0.065823 306.0 227.0 51.0 0.523973 0.388699 0.087329 11.0 20.0
X44CAP 0 2018 1951 3.555348 Rives Alexander 1 Ulysses Grant Republican 84901.0 ... 0.389105 0.064202 493.0 649.0 125.0 0.389108 0.512234 0.098658 11.0 56.0
X47UBJ 1 206 1943 1.609438 Bond Hugh 1 Ulysses Grant Republican 12992.0 ... 0.373239 0.176056 74.0 96.0 34.0 0.362745 0.470588 0.166667 10.0 4.0
X2S1VV 1 1040 1924 0.000000 Higginbotham Patrick 1 Gerald Ford Republican 16749.0 ... 0.516556 0.112583 84.0 136.0 37.0 0.326848 0.529183 0.143969 10.0 9.0
X3IMHM 0 2241 1933 0.693147 Soper Morris 1 Warren G. Harding Republican 38841.0 ... 0.440506 0.070886 273.0 295.0 42.0 0.447541 0.483607 0.068852 10.0 16.0
X4ABIU 1 1327 1935 1.791759 Laffitte Héctor 1 Ronald Reagan Republican 12413.0 ... 0.318471 0.101911 106.0 91.0 28.0 0.471111 0.404444 0.124444 9.0 4.0
X46COO 1 2241 1937 1.609438 Soper Morris 1 Warren G. Harding Republican 83391.0 ... 0.568421 0.033918 348.0 614.0 84.0 0.332696 0.586998 0.080306 9.0 36.0
X3CESF 1 1034 1926 2.484907 Hickenlooper Smith 1 Warren G. Harding Republican 14225.0 ... 0.373134 0.238806 79.0 107.0 43.0 0.344978 0.467249 0.187773 9.0 5.0
X3GDBM 0 1951 1929 1.791759 Quist Gordon 1 George H.W. Bush Republican 23360.0 ... 0.566802 0.093117 95.0 169.0 48.0 0.304487 0.541667 0.153846 9.0 18.0
X3CF30 1 1591 1926 1.098612 McNamee Charles 1 Harry S Truman Democratic 13202.0 ... 0.352113 0.126761 75.0 109.0 38.0 0.337838 0.490991 0.171171 9.0 2.0
X3ITEQ 1 1826 1934 1.386294 Paine Elijah 1 John Adams Federalist 51885.0 ... 0.448692 0.064386 248.0 339.0 72.0 0.376328 0.514416 0.109256 9.0 9.0
X1G8M7K003 0 1326 2011 2.564949 Lacombe Emile 1 Grover Cleveland Democratic 81010.0 ... 0.640969 0.042952 185.0 652.0 103.0 0.196809 0.693617 0.109574 9.0 21.0
X17JCS0003 1 31 2007 2.708050 Allgood Clarence 1 John F. Kennedy Democratic 53065.0 ... 0.551402 0.076636 219.0 345.0 81.0 0.339535 0.534884 0.125581 9.0 28.0
X53HAD 0 1764 1924 0.000000 Nielsen Leland 1 Richard M. Nixon Republican 32994.0 ... 0.524927 0.052786 152.0 331.0 49.0 0.285714 0.622180 0.092105 9.0 8.0
X410OJ 1 1635 1949 1.609438 Mickelson George 1 Dwight D. Eisenhower Republican 36701.0 ... 0.414493 0.107246 171.0 232.0 66.0 0.364606 0.494670 0.140725 9.0 9.0
X4DILP 1 512 1939 0.000000 Cosgrave George 1 Herbert Hoover Republican 45868.0 ... 0.538117 0.067265 183.0 335.0 66.0 0.313356 0.573630 0.113014 9.0 27.0
X3BIQI 0 844 1942 2.302585 Gibbons John 1 Richard M. Nixon Republican 16225.0 ... 0.479769 0.080925 86.0 107.0 25.0 0.394495 0.490826 0.114679 8.0 8.0
X467VU 0 1905 1936 2.302585 Pollard Robert 1 Franklin D. Roosevelt Democratic 22096.0 ... 0.600000 0.097561 85.0 194.0 88.0 0.231608 0.528610 0.239782 8.0 6.0
X3JC9R 0 1478 1935 1.791759 Maris Albert 1 Franklin D. Roosevelt Democratic 28176.0 ... 0.562682 0.037901 152.0 268.0 32.0 0.336283 0.592920 0.070796 8.0 9.0
X47T9L 1 0 0 0.000000 -1 37381.0 ... 0.605485 0.061181 151.0 346.0 51.0 0.275547 0.631387 0.093066 8.0 25.0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
X41HL1 1 0 0 0.000000 -1 2449.0 ... 0.208333 0.125000 20.0 23.0 5.0 0.416667 0.479167 0.104167 1.0 0.0
X48HMV 1 1327 1952 1.791759 Laffitte Héctor 1 Ronald Reagan Republican 3383.0 ... 0.233333 0.033333 24.0 17.0 2.0 0.558140 0.395349 0.046512 1.0 0.0
X467AF 0 1635 1935 1.386294 Mickelson George 1 Dwight D. Eisenhower Republican 5298.0 ... 0.397260 0.027397 26.0 38.0 11.0 0.346667 0.506667 0.146667 1.0 2.0
X48F4B 1 2021 1951 1.609438 Robb Charles 1 Theodore Roosevelt Republican 7023.0 ... 0.326923 0.009615 66.0 51.0 9.0 0.523810 0.404762 0.071429 1.0 2.0
X45BNP 0 0 0 0.000000 -1 12410.0 ... 0.546099 0.014184 56.0 123.0 5.0 0.304348 0.668478 0.027174 1.0 7.0
X40RCL 1 2166 1946 0.693147 Shaw Robert 1 John F. Kennedy Democratic 10675.0 ... 0.279070 0.007752 86.0 56.0 1.0 0.601399 0.391608 0.006993 1.0 1.0
X3C9U4 1 1898 1947 0.693147 Plunkett Paul 1 Ronald Reagan Republican 9540.0 ... 0.335664 0.006993 92.0 70.0 7.0 0.544379 0.414201 0.041420 1.0 9.0
X3JI98 0 2159 1936 1.791759 Shapiro Norma 0 Jimmy Carter Democratic 15308.0 ... 0.137097 0.048387 109.0 54.0 11.0 0.626437 0.310345 0.063218 1.0 2.0
X40UBB 1 1951 1947 1.945910 Quist Gordon 1 George H.W. Bush Republican 23140.0 ... 0.397727 0.022727 142.0 158.0 24.0 0.438272 0.487654 0.074074 1.0 13.0
X40V5B 1 682 1948 1.386294 Economus Peter 1 William J. Clinton Democratic 9872.0 ... 0.477477 0.018018 53.0 64.0 7.0 0.427419 0.516129 0.056452 1.0 14.0
X40Q5D 1 1006 1946 2.484907 Hay George 1 John Quincy Adams Jeffersonian Republican 12236.0 ... 0.567073 0.060976 72.0 105.0 16.0 0.373057 0.544041 0.082902 1.0 5.0
X43CB7 0 1327 1936 0.693147 Laffitte Héctor 1 Ronald Reagan Republican 4114.0 ... 0.553191 0.106383 13.0 34.0 10.0 0.228070 0.596491 0.175439 1.0 0.0
X40V8D 1 1951 1948 2.302585 Quist Gordon 1 George H.W. Bush Republican 20604.0 ... 0.522727 0.040909 124.0 170.0 20.0 0.394904 0.541401 0.063694 1.0 13.0
X43CEH 0 2021 1936 1.791759 Robb Charles 1 Theodore Roosevelt Republican 6139.0 ... 0.407895 0.026316 49.0 44.0 10.0 0.475728 0.427184 0.097087 1.0 3.0
X9VDDL 0 0 0 0.000000 -1 5849.0 ... 0.478261 0.028986 29.0 43.0 13.0 0.341176 0.505882 0.152941 1.0 5.0
X4AIR4 0 1951 1936 2.197225 Quist Gordon 1 George H.W. Bush Republican 19018.0 ... 0.274286 0.040000 114.0 104.0 13.0 0.493506 0.450216 0.056277 1.0 3.0
X410PI 1 1678 1949 3.178054 Moore Kevin 1 George H.W. Bush Republican 16381.0 ... 0.740566 0.033019 45.0 168.0 22.0 0.191489 0.714894 0.093617 1.0 14.0
X3CTGU 1 1678 1951 0.000000 Moore Kevin 1 George H.W. Bush Republican 11174.0 ... 0.589552 0.134328 18.0 93.0 28.0 0.129496 0.669065 0.201439 1.0 0.0
X48A87 1 850 1950 2.772589 Gibson John 1 Ronald Reagan Republican 19458.0 ... 0.382979 0.034574 215.0 205.0 27.0 0.480984 0.458613 0.060403 1.0 28.0
X3CN9R 1 1121 1950 2.708050 Humphrey J. 1 William McKinley Republican 52923.0 ... 0.272601 0.004172 468.0 318.0 35.0 0.570037 0.387333 0.042631 1.0 38.0
X43C5D 0 485 1936 2.484907 Comiskey James 1 Lyndon B. Johnson Democratic 27363.0 ... 0.265455 0.007273 198.0 164.0 20.0 0.518325 0.429319 0.052356 1.0 14.0
X3CQNT 1 879 1950 2.564949 Gonzalez Irma 0 George H.W. Bush Republican 13292.0 ... 0.625641 0.025641 77.0 139.0 18.0 0.329060 0.594017 0.076923 1.0 19.0
X448DG 1 957 1950 1.945910 Hamilton David 1 William J. Clinton Democratic 18225.0 ... 0.402985 0.034826 87.0 139.0 12.0 0.365546 0.584034 0.050420 1.0 12.0
X48C38 1 627 1950 2.833213 Dodge Frederic 1 Theodore Roosevelt Republican 36656.0 ... 0.473418 0.020253 217.0 272.0 24.0 0.423002 0.530214 0.046784 1.0 25.0
X43Q6I 1 361 1950 1.098612 Campbell John 1 Franklin Pierce Democratic 24433.0 ... 0.578595 0.013378 138.0 238.0 22.0 0.346734 0.597990 0.055276 1.0 13.0
X41PD4 0 364 1959 1.098612 Campbell Marcus 1 Warren G. Harding Republican 9779.0 ... 0.723214 0.035714 25.0 106.0 10.0 0.177305 0.751773 0.070922 1.0 8.0
X44981 1 1381 1951 0.000000 Letts Fred 1 Herbert Hoover Republican 11694.0 ... 0.398810 0.047619 86.0 114.0 18.0 0.394495 0.522936 0.082569 1.0 8.0
X43QTQ 1 0 0 0.000000 -1 11884.0 ... 0.603053 0.022901 47.0 94.0 10.0 0.311258 0.622517 0.066225 1.0 7.0
X48DTO 1 1121 1951 0.000000 Humphrey J. 1 William McKinley Republican 8037.0 ... 0.440000 0.020000 53.0 41.0 6.0 0.530000 0.410000 0.060000 1.0 4.0
X9VH48 0 928 1965 1.945910 Guinn Ernest 1 Lyndon B. Johnson Democratic 10597.0 ... 0.500000 0.044776 54.0 78.0 12.0 0.375000 0.541667 0.083333 1.0 7.0

1592 rows × 29 columns

In [176]:
findd(df.loc["X4AKVH", "doc"])     # find deontic futures in one of cases corpus's doc
----------------------------------------------------:
number of matches: 3
court shall direct
receiver shall pay
shareholder shall contribute
In [177]:
findd(df.loc["X42IM4", "doc"])     # find deontic futures in one of cases corpus's doc
----------------------------------------------------:
number of matches: 3
State shall provide
person shall be
State shall provide
In [178]:
 findd(df.loc["X3IMFV", "doc"])  # find deontic futures in one of cases corpus's doc
----------------------------------------------------:
number of matches: 20
year shall be
trust shall be
corpus shall discharge
act shall be
trust shall be
trustee shall collect
trustee shall take
estate shall be
estate shall be
trust shall fail
accumulation shall be
estate shall revert
paragraph shall be
estate shall revert
hereunder shall be
estate shall not produce
delivery shall be
amount shall be
herein shall be
accumulation shall be
In [179]:
plt.hist(df["lmodal"], bins = 50, alpha = 0.5, label = "cases ", range = (1,50), color = 'g', density = 1)
plt.hist(dfmo["lmodal"], bins = 50, alpha = 0.5, label = "pol speeches ",  range = (1,50), color = 'b', density = 1)
plt.hist(dfre["lmodal"], bins = 50, alpha = 0.5, label = "reuters ",  range = (1,50), color = 'r', density = 1)
plt.hist(dftw["lmodal"], bins = 50, alpha = 0.5, label = "twitter ",  range = (1,50), color = 'violet', density = 1)
plt.title("absolute numbers of modal verbs in 4 datasets , restricted to >= 1 modal ")
plt.legend(loc='upper right')
plt.show()
In [ ]:
 
In [180]:
# remember df = pickle.load(open("/hdf = pickle.load (open("pj_df_full.20190629_095112.pkl", "rb")) ome/xhta/Robot/proj/pj_df_full.20190629_095112.pkl", "rb"))
import pickle
df = pickle.load (open("pj_df_full.20190629_095112.pkl", "rb")) 
In [181]:
df2 = df [ (df['Party'] == 'Republican' ) | (df['Party'] == 'Democratic' ) ]
In [182]:
len(df2)
Out[182]:
4637
In [183]:
df2.Party.replace(['Republican', 'Democratic'], [1, 0], inplace = True)
In [184]:
df2.head()
Out[184]:
case_reversed judge_id year log_cites LastName FirstName Gender Pres Party nlets ... antpresent antfuture nfpast nfpresent nffuture antfpast antfpresent antffuture ldeont lmodal
caseid
X3JGGO 0 1653 1925 1.098612 Miller Wilbur 1 Harry S Truman 0 46558.0 ... 0.363874 0.172775 280.0 308.0 101.0 0.406386 0.447025 0.146589 18.0 7.0
X3OH3J 0 1034 1924 1.609438 Hickenlooper Smith 1 Warren G. Harding 1 16689.0 ... 0.335025 0.005076 123.0 94.0 12.0 0.537118 0.410480 0.052402 0.0 8.0
X3U0KO 0 2303 1925 1.791759 Story William 1 Ulysses Grant 1 5044.0 ... 0.527273 0.036364 31.0 39.0 8.0 0.397436 0.500000 0.102564 0.0 3.0
X53HAD 0 1764 1924 0.000000 Nielsen Leland 1 Richard M. Nixon 1 32994.0 ... 0.524927 0.052786 152.0 331.0 49.0 0.285714 0.622180 0.092105 9.0 8.0
X9VC5V 0 493 1925 0.000000 Connally Ben 1 Harry S Truman 0 1276.0 ... 0.181818 0.000000 15.0 7.0 0.0 0.681818 0.318182 0.000000 0.0 1.0

5 rows × 29 columns

In [185]:
from sklearn.model_selection import train_test_split
In [186]:
features = ['antpast', 'antfuture', 'Party', 'nlets', 'lmodal', 'year']
In [187]:
X = df2[features]
In [188]:
X_train, X_test = train_test_split(df2, test_size = 0.3, random_state = 1234)

Linear Regression

In [189]:
from sklearn import linear_model

reg = linear_model.LinearRegression()
reg.fit(df2[features], df2['log_cites'])
Out[189]:
LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)
In [190]:
reg.score(df2[features], df2['log_cites'])
Out[190]:
0.17426705785040097
In [191]:
features = ['nfpast', 'nfpresent', 'nfuture', 'Party', 'nlets', 'lmodal', 'year']
In [192]:
reg = linear_model.LinearRegression()
reg.fit(df2[features], df2['log_cites'])
reg.score(df2[features], df2['log_cites'])
Out[192]:
0.1736947601005383
In [36]:
from sklearn import linear_model
In [193]:
features = ['nfpast', 'nfpresent', 'nfuture', 'Party', 'nlets', 'lmodal', 'year']

LogisticRegression

In [194]:
from sklearn.linear_model import LogisticRegression
In [195]:
clflo = LogisticRegression(random_state = 1234, solver = 'liblinear').fit(X_train[features], X_train['case_reversed'])
In [196]:
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix
In [197]:
ypreda = clflo.predict(X_train[features])
yprede = clflo.predict(X_test[features])
In [198]:
print(accuracy_score(X_train['case_reversed'], ypreda), accuracy_score(X_test['case_reversed'], yprede))
0.6305084745762712 0.6293103448275862

Naive Bayes Classifier

In [199]:
from sklearn.naive_bayes import GaussianNB

gnb = GaussianNB()

features = [ 'nsents', 'npast', 'nfuture', 'lmodal', 'Party', 'year']
gnb.fit(X_train[features].values, X_train['case_reversed'])
y_pred = gnb.predict(X_test[features])

cm = confusion_matrix(X_test['case_reversed'], y_pred)
tn, fp, fn, tp = cm.ravel()
print ("accuracy:", (tn + tp) / np.sum(cm))
accuracy: 0.617816091954023

svm classifier

In [201]:
from sklearn import svm
C = 1.0
models = ( svm.SVC(kernel = 'linear' , C = C), svm.LinearSVC (C = C), svm.SVC(kernel = 'rbf', gamma = 'auto', C=C))

models = (clf.fit(X_train[features], X_train['case_reversed']) for clf in models)

cms = []
cms = (confusion_matrix(X_test['case_reversed'], clf.predict(X_test[features])) for clf in models)

for cm in cms:
    tn, fp, fn, tp = cm.ravel()
    print ("accuracy:", (tn + tp) / np.sum(cm))
accuracy: 0.6307471264367817
accuracy: 0.6307471264367817
accuracy: 0.6285919540229885

SGD Classifier

In [202]:
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score
In [203]:
features = ['antpast', 'antfuture', 'Party', 'nlets', 'lmodal', 'year']
X = df2[features]
y = df2['log_cites']
X_train, X_test = train_test_split(df2, test_size = 0.3, random_state = 1234)
In [204]:
n_folds = 10
kf = KFold(n_splits = n_folds, shuffle=True, random_state = 1234)

cv_results = cross_val_score(SGDClassifier(max_iter = 1000, tol = 1e-03, loss='log', penalty = 'l2'), X_train[features],
                                                   X_train['case_reversed'],  scoring = 'accuracy', cv=kf)

print ("accuracy  mean:", cv_results.mean(), "   std:", cv_results.std())
accuracy  mean: 0.5928793922127256    std: 0.07754784024166905

LogisticRegressionCV

In [205]:
from sklearn.linear_model import LogisticRegressionCV
clf = LogisticRegressionCV(cv = 10, random_state = 1234, multi_class ='ovr').fit(X_train[features], X_train['case_reversed'])
clf.score(X_train[features], X_train['case_reversed'])
Out[205]:
0.6329738058551618

ensemble

In [206]:
from sklearn import clone
from sklearn.metrics import confusion_matrix
from sklearn.ensemble import (RandomForestClassifier, ExtraTreesClassifier, AdaBoostClassifier)
from sklearn.tree import DecisionTreeClassifier

n_estimators = 100
models = [DecisionTreeClassifier(max_depth=None), RandomForestClassifier(n_estimators=n_estimators),
ExtraTreesClassifier(n_estimators=n_estimators), AdaBoostClassifier(DecisionTreeClassifier(max_depth=3), n_estimators=n_estimators)]
#
for model in models:
    clf = clone(model)
    clf = model.fit(X_train[features], X_train['case_reversed'])
#
    y_pred = clf.predict(X_test[features])
    cm = confusion_matrix(X_test['case_reversed'], y_pred)
#
    tn, fp, fn, tp = cm.ravel()
    print ("accuracy:", (tn + tp) / np.sum(cm))
accuracy: 0.5330459770114943
accuracy: 0.5761494252873564
accuracy: 0.5833333333333334
accuracy: 0.5732758620689655

instrument variables

In [207]:
df = pickle.load (open("pj_df_full.20190629_095112.pkl", "rb"))      # cases corpus
In [208]:
df2 = df [ (df['Party'] == 'Republican' ) | (df['Party'] == 'Democratic' ) ]
In [209]:
df2.Party.replace(['Republican', 'Democratic'], [1, 0], inplace = True)
In [210]:
from linearmodels.iv import IV2SLS
In [211]:
df2['_const'] = 1
In [212]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer
In [213]:
import time
start_time = time.time()

sid = SentimentIntensityAnalyzer()
for inde in df2.index:
    ss = sid.polarity_scores(str(df2.loc[inde, 'doc']))
    df2.loc[inde, 'cs'] = ss['compound']
# this step takes quite long to complete
print (time.time() - start_time)
2058.4527649879456
In [214]:
features = ['_const', 'antpast', 'antfuture',  'nlets', 'lmodal', 'year', 'Party', 'cs']
In [215]:
res_ols = IV2SLS(dependent=df2['log_cites'], exog = df2[features], endog = None, instruments = None).fit(cov_type='clustered', clusters=df2['year'])
print(res_ols)
                            OLS Estimation Summary                            
==============================================================================
Dep. Variable:              log_cites   R-squared:                      0.1814
Estimator:                        OLS   Adj. R-squared:                 0.1801
No. Observations:                4637   F-statistic:                    328.98
Date:                Tue, Jul 16 2019   P-value (F-stat)                0.0000
Time:                        14:43:55   Distribution:                  chi2(7)
Cov. Estimator:             clustered                                         
                                                                              
                             Parameter Estimates                              
==============================================================================
            Parameter  Std. Err.     T-stat    P-value    Lower CI    Upper CI
------------------------------------------------------------------------------
_const        -9.8864     3.7371    -2.6455     0.0082     -17.211     -2.5618
antpast       -0.3766     0.1261    -2.9852     0.0028     -0.6238     -0.1293
antfuture     -2.7617     0.6951    -3.9733     0.0001     -4.1240     -1.3994
nlets       1.748e-05  2.261e-06     7.7329     0.0000   1.305e-05   2.192e-05
lmodal         0.0046     0.0023     2.0161     0.0438      0.0001      0.0091
year           0.0059     0.0019     3.1357     0.0017      0.0022      0.0096
Party          0.0393     0.0299     1.3164     0.1880     -0.0192      0.0979
cs            -0.1027     0.0157    -6.5410     0.0000     -0.1334     -0.0719
==============================================================================
In [216]:
features = ['_const', 'antpast', 'antfuture',  'nlets', 'lmodal', 'year', 'Party']
In [217]:
res_1st = IV2SLS(dependent=df2[ 'cs'], exog = df2[features], endog = None, instruments = None).fit(cov_type='clustered', clusters=df2['year'])
print(res_1st)
                            OLS Estimation Summary                            
==============================================================================
Dep. Variable:                     cs   R-squared:                      0.0679
Estimator:                        OLS   Adj. R-squared:                 0.0667
No. Observations:                4637   F-statistic:                    316.17
Date:                Tue, Jul 16 2019   P-value (F-stat)                0.0000
Time:                        14:45:12   Distribution:                  chi2(6)
Cov. Estimator:             clustered                                         
                                                                              
                             Parameter Estimates                              
==============================================================================
            Parameter  Std. Err.     T-stat    P-value    Lower CI    Upper CI
------------------------------------------------------------------------------
_const         14.834     1.2497     11.870     0.0000      12.385      17.284
antpast       -1.3474     0.1182    -11.396     0.0000     -1.5791     -1.1157
antfuture      1.1555     0.6613     1.7472     0.0806     -0.1407      2.4516
nlets       9.039e-06  1.444e-06     6.2599     0.0000   6.209e-06   1.187e-05
lmodal        -0.0081     0.0019    -4.3199     0.0000     -0.0118     -0.0044
year          -0.0071     0.0006    -11.404     0.0000     -0.0083     -0.0059
Party         -0.0130     0.0274    -0.4771     0.6333     -0.0667      0.0406
==============================================================================
In [218]:
features = ['_const', 'antpast', 'antfuture',  'nlets', 'lmodal', 'year']
In [219]:
res_2nd = IV2SLS(dependent=df2['log_cites'], exog = df2[['_const']], endog = df2['cs'], instruments = df2[['antpast', 'antfuture',  'nlets', 'lmodal', 'year', 'Party']]).fit(cov_type='clustered', clusters=df2['year'])
print(res_2nd)
                          IV-2SLS Estimation Summary                          
==============================================================================
Dep. Variable:              log_cites   R-squared:                      0.0045
Estimator:                    IV-2SLS   Adj. R-squared:                 0.0043
No. Observations:                4637   F-statistic:                    2.4711
Date:                Tue, Jul 16 2019   P-value (F-stat)                0.1160
Time:                        14:45:16   Distribution:                  chi2(1)
Cov. Estimator:             clustered                                         
                                                                              
                             Parameter Estimates                              
==============================================================================
            Parameter  Std. Err.     T-stat    P-value    Lower CI    Upper CI
------------------------------------------------------------------------------
_const         1.9727     0.0407     48.505     0.0000      1.8930      2.0525
cs            -0.1818     0.1157    -1.5720     0.1160     -0.4086      0.0449
==============================================================================

Endogenous: cs
Instruments: antpast, antfuture, nlets, lmodal, year, Party
Clustered Covariance (One-Way)
Debiased: False
Num Clusters: 90
In [220]:
res_2nd.wu_hausman()
# the wu hausman statistic is a variant of the Durbin-Watson test for instrument variables
# (H0 is rejected if test statistic > critical value  or p-value < a critical value)
# Davidson, MacKinnon : Estimation and Inference in Ecnometrics  Chapter 8 
# here H0 can be rejected , hence not all endogenous variables are exogenous
Out[220]:
Wu-Hausman test of exogeneity
H0: All endogenous variables are exogenous
Statistic: 47.4757
P-value: 0.0000
Distributed: F(1,4634)
WaldTestStatistic, id: 0x7f371356e3c8

statsmodels

In [45]:
import statsmodels.api as sm
In [46]:
features = ['antpast', 'antfuture', 'Party', 'nlets', 'lmodal', 'year']
In [47]:
X = df2[features]
X.head()
Out[47]:
antpast antfuture Party nlets lmodal year
caseid
X3JGGO 0.463351 0.172775 0 46558.0 7.0 1925
X3OH3J 0.659898 0.005076 1 16689.0 8.0 1924
X3U0KO 0.436364 0.036364 1 5044.0 3.0 1925
X53HAD 0.422287 0.052786 1 32994.0 8.0 1924
X9VC5V 0.818182 0.000000 0 1276.0 1.0 1925
In [48]:
y = df2['log_cites']
In [49]:
X = sm.add_constant(X)
/home/xhta/anaconda3/lib/python3.5/site-packages/numpy/core/fromnumeric.py:2389: FutureWarning: Method .ptp is deprecated and will be removed in a future version. Use numpy.ptp instead.
  return ptp(axis=axis, out=out, **kwargs)
In [50]:
X.head()
Out[50]:
const antpast antfuture Party nlets lmodal year
caseid
X3JGGO 1.0 0.463351 0.172775 0 46558.0 7.0 1925
X3OH3J 1.0 0.659898 0.005076 1 16689.0 8.0 1924
X3U0KO 1.0 0.436364 0.036364 1 5044.0 3.0 1925
X53HAD 1.0 0.422287 0.052786 1 32994.0 8.0 1924
X9VC5V 1.0 0.818182 0.000000 0 1276.0 1.0 1925
In [51]:
sm_model = sm.OLS(y, X).fit()
In [52]:
pred = sm_model.predict(X)
In [53]:
sm_model.summary()
Out[53]:
OLS Regression Results
Dep. Variable: log_cites R-squared: 0.174
Model: OLS Adj. R-squared: 0.173
Method: Least Squares F-statistic: 162.9
Date: Tue, 16 Jul 2019 Prob (F-statistic): 2.49e-188
Time: 08:35:01 Log-Likelihood: -6513.4
No. Observations: 4637 AIC: 1.304e+04
Df Residuals: 4630 BIC: 1.309e+04
Df Model: 6
Covariance Type: nonrobust
coef std err t P>|t| [0.025 0.975]
const -11.4096 1.444 -7.903 0.000 -14.240 -8.579
antpast -0.2382 0.115 -2.063 0.039 -0.465 -0.012
antfuture -2.8803 0.731 -3.939 0.000 -4.314 -1.447
Party 0.0407 0.029 1.392 0.164 -0.017 0.098
nlets 1.656e-05 1.53e-06 10.820 0.000 1.36e-05 1.96e-05
lmodal 0.0054 0.002 3.022 0.003 0.002 0.009
year 0.0066 0.001 9.148 0.000 0.005 0.008
Omnibus: 37.409 Durbin-Watson: 1.717
Prob(Omnibus): 0.000 Jarque-Bera (JB): 38.225
Skew: -0.222 Prob(JB): 5.01e-09
Kurtosis: 2.963 Cond. No. 2.83e+06


Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 2.83e+06. This might indicate that there are
strong multicollinearity or other numerical problems.
In [ ]:
 
In [54]:
features = ['nfpast', 'nfpresent', 'nfuture', 'Party', 'nlets', 'lmodal', 'year']
In [55]:
X = df2[features]
X = sm.add_constant(X)
sm_model = sm.OLS(y, X).fit()
pred = sm_model.predict(X)
sm_model.summary()
Out[55]:
OLS Regression Results
Dep. Variable: log_cites R-squared: 0.174
Model: OLS Adj. R-squared: 0.172
Method: Least Squares F-statistic: 139.0
Date: Tue, 16 Jul 2019 Prob (F-statistic): 1.64e-186
Time: 08:35:13 Log-Likelihood: -6515.0
No. Observations: 4637 AIC: 1.305e+04
Df Residuals: 4629 BIC: 1.310e+04
Df Model: 7
Covariance Type: nonrobust
coef std err t P>|t| [0.025 0.975]
const -15.2253 1.520 -10.015 0.000 -18.206 -12.245
nfpast 0.0009 0.000 3.053 0.002 0.000 0.002
nfpresent 0.0005 0.000 1.347 0.178 -0.000 0.001
nfuture -0.0045 0.003 -1.346 0.178 -0.011 0.002
Party 0.0403 0.029 1.378 0.168 -0.017 0.098
nlets 1.046e-05 3.35e-06 3.120 0.002 3.89e-06 1.7e-05
lmodal 0.0036 0.002 1.884 0.060 -0.000 0.007
year 0.0085 0.001 10.946 0.000 0.007 0.010
Omnibus: 37.714 Durbin-Watson: 1.706
Prob(Omnibus): 0.000 Jarque-Bera (JB): 38.541
Skew: -0.223 Prob(JB): 4.28e-09
Kurtosis: 2.973 Cond. No. 2.91e+06


Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 2.91e+06. This might indicate that there are
strong multicollinearity or other numerical problems.

Feature Importance

In [56]:
from sklearn.preprocessing import Imputer
from sklearn.model_selection import  cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.metrics import make_scorer
from sklearn.ensemble import RandomForestRegressor

from skll.metrics import spearman

from skopt import BayesSearchCV
from skopt.space import Real, Categorical, Integer

import warnings
In [57]:
RANDOM_STATE=1234
N_JOBS=8
# the modeling pipeline
pipe = Pipeline([("imputer", Imputer()),
                 ("estimator", RandomForestRegressor(random_state=RANDOM_STATE))])
/home/xhta/anaconda3/lib/python3.5/site-packages/sklearn/utils/deprecation.py:58: DeprecationWarning: Class Imputer is deprecated; Imputer was deprecated in version 0.20 and will be removed in 0.22. Import impute.SimpleImputer from sklearn instead.
  warnings.warn(msg, category=DeprecationWarning)
In [58]:
spearman_scorer = make_scorer(spearman)
# the hyperparamters to search over, including different imputation strategies
rf_param_space = {
    'imputer__strategy': Categorical(['mean', 'median', 'most_frequent']),
    'estimator__max_features': Integer(1, 5),   # was Integer(1, 8),
    'estimator__n_estimators': Integer(50, 60),    # was Integer(50, 500)
    'estimator__min_samples_split': Integer(70, 85),  # was Integer(2, 200)
}
# create our search object
search = BayesSearchCV(pipe, 
                      rf_param_space, 
                      cv=10,
                      n_jobs=N_JOBS, 
                      verbose=0, 
                      error_score=-9999, 
                      scoring=spearman_scorer, 
                      random_state=RANDOM_STATE,
                      return_train_score=True, 
                      n_iter=75)
In [66]:
import pickle
df = pickle.load(open("/home/xhta/Robot/proj/pj_df_full.20190629_095112.pkl", "rb"))

df2 = df [ (df['Party'] == 'Republican' ) | (df['Party'] == 'Democratic' ) ]

df2.Party.replace(['Republican', 'Democratic'], [1, 0], inplace = True)

from sklearn.model_selection import train_test_split

X_train, X_test = train_test_split(df2, test_size = 0.7, random_state = 1234)
/home/xhta/anaconda3/lib/python3.5/site-packages/pandas/core/generic.py:6586: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._update_inplace(new_data)
In [59]:
features = ['npast', 'npresent', 'Party', 'nlets', 'lmodal', 'year']
In [60]:
# attention, search can take some time
import time
start_time = time.time()
with warnings.catch_warnings():
    warnings.filterwarnings('ignore')
    search.fit(X_train[features], X_train['log_cites']) 
print (time.time() - start_time)
493.59160709381104
In [61]:
search.best_params_
Out[61]:
{'estimator__max_features': 4,
 'estimator__min_samples_split': 85,
 'estimator__n_estimators': 50,
 'imputer__strategy': 'mean'}
In [62]:
# CV score
search.best_score_
Out[62]:
0.5266693426366235
In [63]:
# CV standard deviation
search.cv_results_['std_test_score'][search.best_index_]
Out[63]:
0.04923320254936652
In [64]:
estimator = search.best_estimator_.named_steps['estimator']
imputer = search.best_estimator_.named_steps['imputer']

estimator.feature_importances_
Out[64]:
array([0.09543089, 0.3010701 , 0.0043026 , 0.26635111, 0.05147272,
       0.28137259])
In [65]:
%matplotlib notebook
import matplotlib.pyplot as plt
import seaborn as sns
In [66]:
X = df2[features]
In [68]:
# get the feature importances from each tree and then visualize the
# distributions as boxplots
all_feat_imp_df = pd.DataFrame(data=[tree.feature_importances_ for tree in 
                                     estimator],
                               columns=list(X.columns))

(sns.boxplot(data=all_feat_imp_df)
        .set(title='Feature Importance Distributions',
             ylabel='Importance'))
plt.show()
In [69]:
%%time
import xgboost as xgb
xgc = xgb.XGBClassifier(n_estimators=500, max_depth=5, best_score=0.5, objective='binary:logistic',  random_state=1234)
CPU times: user 25.3 ms, sys: 4.01 ms, total: 29.3 ms
Wall time: 502 ms
In [70]:
xgc.fit(X_train[features], X_train['case_reversed'])
Out[70]:
XGBClassifier(base_score=0.5, best_score=0.5, booster='gbtree',
       colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
       gamma=0, learning_rate=0.1, max_delta_step=0, max_depth=5,
       min_child_weight=1, missing=None, n_estimators=500, n_jobs=1,
       nthread=None, objective='binary:logistic', random_state=1234,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=None, subsample=1, verbosity=1)
In [71]:
pred = xgc.predict(X_test[features])
In [72]:
print (pred[0:10], X_test['case_reversed'][0:10])
[1 0 1 1 1 1 1 1 0 1] caseid
X3RPPG    1
X3BB0Q    1
X3N0DK    1
X40EIV    1
X47NQL    1
X32VPV    1
X3RLGT    1
X3OSJV    0
X3H44B    0
X3HV6D    0
Name: case_reversed, dtype: int64
In [73]:
fig = plt.figure(figsize  = (16,12))
title = fig.suptitle("Default Feature Importance from XGBoost", fontsize=14)

ax1 = fig.add_subplot(2,2,1)
xgb.plot_importance(xgc, importance_type = 'weight', ax = ax1)
t = ax1.set_title("Feature Importance - Feature Weight")

ax2 = fig.add_subplot(2,2,2)
xgb.plot_importance(xgc, importance_type = 'gain', ax = ax2)
t = ax2.set_title("Feature Importance - Split Mean Gain")
                      
ax3 = fig.add_subplot(2,2,3)
xgb.plot_importance(xgc, importance_type = 'cover', ax = ax3)
t = ax3.set_title("Feature Importance - Sample Coverage")

Global interpretation with Skater

In [74]:
from skater.core.explanations import Interpretation
from skater.model import InMemoryModel
In [ ]:
#Create an interpretation object
In [75]:
interpreter = Interpretation(training_data=X_test[features], training_labels=X_test['case_reversed'], feature_names=features)
im_model = InMemoryModel(xgc.predict_proba, examples=X_train[features], target_names=['not reverted', 'reverted'])
In [76]:
plots = interpreter.feature_importance.plot_feature_importance(im_model, ascending=True, n_samples=1000)
2019-07-16 09:14:30,113 - skater.core.explanations - WARNING - Progress bars slow down runs by 10-20%. For slightly 
faster runs, do progress_bar=False
[6/6] features ████████████████████ Time elapsed: 2 seconds

Local interpretation with Skater LIME

In [77]:
xgc_np = xgb.XGBClassifier(n_estimators=500, map_depth=5, base_score=0.5, objective = 'binary:logistic', random_state=1234)
xgc_np.fit(X_train[features].values, X_train['case_reversed'])
Out[77]:
XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bynode=1, colsample_bytree=1, gamma=0, learning_rate=0.1,
       map_depth=5, max_delta_step=0, max_depth=3, min_child_weight=1,
       missing=None, n_estimators=500, n_jobs=1, nthread=None,
       objective='binary:logistic', random_state=1234, reg_alpha=0,
       reg_lambda=1, scale_pos_weight=1, seed=None, silent=None,
       subsample=1, verbosity=1)
In [78]:
from skater.core.local_interpretation.lime.lime_tabular import LimeTabularExplainer
#exp = LimeTabularExplainer(X_test[features], feature_names= list(X_test[features].columns), discretize_continuous = False, class_names=['not reverted', 'reverted'])
exp = LimeTabularExplainer(X_test[features], feature_names= ['npast', 'npresent', 'Party', 'nlets', 'lmodal', 'year'], discretize_continuous = False, class_names=['not reverted', 'reverted'])
In [79]:
features
Out[79]:
['npast', 'npresent', 'Party', 'nlets', 'lmodal', 'year']
In [80]:
import numpy as np
print('Actual Label:', X_test['case_reversed'][0])
print('Predicted Label:', pred[0])
#exp.explain_instance((X_test[features]).iloc[0], xgc_np.predict_proba).show_in_notebook()
exp.explain_instance(data_row = X_train[features].iloc[0], predict_fn=xgc_np.predict_proba).show_in_notebook()
Actual Label: 1
Predicted Label: 1
In [82]:
import numpy as np
print('Actual Label:', X_test['case_reversed'][1])
print('Predicted Label:', pred[1])
#exp.explain_instance((X_test[features]).iloc[0], xgc_np.predict_proba).show_in_notebook()
exp.explain_instance(data_row = X_train[features].iloc[1], predict_fn=xgc_np.predict_proba).show_in_notebook()
Actual Label: 1
Predicted Label: 0
In [ ]: